# Load necessary libraries
library(tidyverse)  # For data manipulation and visualization
library(sf)
library(did)

# Load the data
load("00_data/01_data_processed/epa_empl.Rdata") 

# Define EPA treatment regions
EPA_TREAT <- c("01", "02", "03", "04", "05", "07")

# Map states to EPA regions
state_to_region <- list(
  `01` = c("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT"),
  `02` = c("NEW JERSEY", "NEW YORK", "PUERTO RICO", "U.S. VIRGIN ISLANDS"),
  `03` = c("DELAWARE", "DISTRICT OF COLUMBIA", "MARYLAND", "PENNSYLVANIA", "VIRGINIA", "WEST VIRGINIA"),
  `04` = c("ALABAMA", "FLORIDA", "GEORGIA", "KENTUCKY", "MISSISSIPPI", "NORTH CAROLINA", "SOUTH CAROLINA", "TENNESSEE"),
  `05` = c("ILLINOIS", "INDIANA", "MICHIGAN", "MINNESOTA", "OHIO", "WISCONSIN"),
  `06` = c("ARKANSAS", "LOUISIANA", "NEW MEXICO", "OKLAHOMA", "TEXAS"),
  `07` = c("IOWA", "KANSAS", "MISSOURI", "NEBRASKA"),
  `08` = c("COLORADO", "MONTANA", "NORTH DAKOTA", "SOUTH DAKOTA", "UTAH", "WYOMING"),
  `09` = c("ARIZONA", "CALIFORNIA", "HAWAII", "NEVADA", "AMERICAN SAMOA", "GUAM", "NORTHERN MARIANA ISLANDS"),
  `10` = c("ALASKA", "IDAHO", "OREGON", "WASHINGTON")
)

# Function to find the EPA region for a given state
find_region <- function(state_name) {
  for (region in names(state_to_region)) {
    if (state_name %in% state_to_region[[region]]) {
      return(region)
    }
  }
  return(NA)  # Return NA if the state name does not match any region
}

# Summarize EPA employment data by state and year
EPA_EMPL_COUNTY <- EPA_EMPL_COUNTY %>% 
  filter(occupation %in% c("0028", "0819", "1301", "0801")) %>% 
  group_by(State, year) %>% 
  summarize(
    number = n(),
    median_grade = median(grade_numeric, na.rm = TRUE),
    median_edu = median(education_numeric, na.rm = TRUE),
    mean_grade = mean(grade_numeric, na.rm = TRUE),
    mean_edu = mean(education_numeric, na.rm = TRUE)
  )


# Assign EPA regions to each entry
EPA_EMPL_COUNTY$epa_region <- sapply(EPA_EMPL_COUNTY$State, find_region)

region_employee <- EPA_EMPL_COUNTY %>%
  filter(year == 1994) %>% 
  group_by(epa_region) %>%
  summarize(empl = sum(number)) %>% 
  filter(epa_region %in% EPA_TREAT)
            
######

# Load necessary data
load("00_data/01_data_processed/data_epa_analysis.Rdata")

# Preparing the data for modeling

# Set treatment year for analysis
epa_only$treatment_year_reg <- ifelse(epa_only$treatment_region == 0, 0, 1995)
epa_only$treatment_year_sta <- ifelse(epa_only$treatment_state == 0, 0, 1995)

# Convert PGM_SYS_ID to a numeric identifier
epa_only$PGM_SYS_ID_num <- as.numeric(as.factor(epa_only$PGM_SYS_ID))

# Define the categorization for each region

region_categories <- c(
  "01" = "Medium", # Employees: 1430 == LOW ;    TASK = LOW
  "02" = "High",   # Employees: 1797 == MEDIUM ; TASK = LOW
  "03" = "High",   # Employees: 6889 == HIGH ;   TASK = MEDIUM
  "04" = "Medium", # Employees: 3129 == MEDIUM ; TASK = MEDIUM
  "05" = "Low",    # Employees: 3049 == MEDIUM ; TASK = HIGH
  "07" = "Low"     # Employees: 872 == LOW ;     TASK = MEDIUM
)

# Define the set of categories to loop over
categories <- c("Low", "Medium", "High")

# Create a container (list) to hold the dynamic effect results for each category
category_results <- list()

# Loop over each category
for (cat in categories) {
  
  # Get the region codes corresponding to the current category
  region_codes <- names(region_categories)[region_categories == cat]
  
  # --- Subset the data for the current category ---
  # Include observations whose EPA_REGION is in the current category OR are never treated (control)
  cat_data <- epa_only %>%
    filter((EPA_REGION %in% region_codes | treatment_year_reg == 0) &
             year < 2002 & year > 1989)
  
  # --- Run the DID analysis on this subset ---
  model_cat <- att_gt(
    yname  = "outcome_epa",
    tname  = "year",
    idname = "PGM_SYS_ID_num",
    gname  = "treatment_year_reg",
    data   = cat_data,
    alp    = 0.01  # 99% confidence level
  )
  
  # --- Get the dynamic treatment effect estimates ---
  aggte_cat <- aggte(model_cat, type = "dynamic", na.rm = TRUE)
  
  # --- Convert to a tidy data frame ---
  cat_df <- ggdid(aggte_cat)$data
  
  category_results[[cat]] <- cat_df
}

# Combine all category-specific results into one data frame
all_categories_df <- do.call(rbind, category_results)
all_categories_df$category <- c(rep("Low",11), rep("Medium", 11), rep("High",11))

# Plot all categories' dynamic estimates together
ggplot(
  all_categories_df,
  aes(
    x = year,
    y = att,
    ymin = att - c * att.se,
    ymax = att + c * att.se,
    color = category
  )
) +
  geom_pointrange(size = .2) +
  geom_line(linetype = 3) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 4, linetype = "dotted") +
  theme_bw() +
  labs(x = "Years Since Treatment", y = "Estimate (99% conf. band)", color = "EPA Category") +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08)) +
  theme(
    legend.background = element_rect(
      fill = "white",
      linewidth = 0.2,
      linetype = "solid",
      colour = "black"
    ),
    legend.position = c(0.9, 0.2),
    legend.title = element_blank()
  ) +
  scale_color_discrete(breaks = c('High', 'Medium', 'Low'))


ggsave("03_figures/appendix/figure_a14.pdf", height = 3, width = 8)

rm(list = ls())

